Install necessary packages.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
if (!requireNamespace("GEOmetadb", quietly = TRUE))
BiocManager::install("GEOmetadb")
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
if (!requireNamespace("edgeR", quietly = TRUE))
BiocManager::install("edgeR")
if (!requireNamespace("ggplot2", quietly = TRUE))
BiocManager::install("ggplot2")
if (!requireNamespace("reshape2", quietly = TRUE))
BiocManager::install("reshape2")
if (!requireNamespace("HGNChelper", quietly = TRUE))
BiocManager::install("HGNChelper")
Get the data for my chosen GEO id. I chose to work with the data set from an experiment that uses RNA-sequencing to study stem cell derived cortical excitatory (cExN) and cortical inhibitory (cIN) neurons from three individuals with varying affectation of autism syndrome disorder (ASD) (Lewis, Meganathan and Baldridge 2019).
myGEOID <- 'GSE129806'
suppFiles = GEOquery::getGEOSuppFiles(myGEOID)
Read the data in. There are 2 files provided one with the RPKM data and one with the raw counts data. I will work with the second file which has the counts data.
fileNames = rownames(suppFiles)
data <- read.delim(fileNames[2], check.names=FALSE, header=TRUE)
Check what the columns represent. They represent the cExN or cIN samples from the 4 individuals. “UC” prefix represents unaffected control group individual. “AP” prefix represents affected proband individual. “IS” prefix represents the intermediated phenotype sister. “UM” prefix respesents unaffected mother.
colnames(data)
## [1] "name" "length" "UC_cIN_R1" "UC_cIN_R2" "UC_cIN_R3"
## [6] "UC_cIN_R4" "AP_cIN_R1" "AP_cIN_R2" "AP_cIN_R3" "AP_cIN_R4"
## [11] "IS_cIN_R1" "IS_cIN_R2" "IS_cIN_R3" "IS_cIN_R4" "UM_cIN_R1"
## [16] "UM_cIN_R2" "UM_cIN_R3" "UM_cIN_R4" "UC_cExN_R1" "UC_cExN_R2"
## [21] "UC_cExN_R3" "UC_cExN_R4" "AP_cExN_R1" "AP_cExN_R2" "AP_cExN_R3"
## [26] "AP_cExN_R4" "IS_cExN_R1" "IS_cExN_R2" "IS_cExN_R3" "IS_cExN_R4"
## [31] "UM_cExN_R1" "UM_cExN_R2" "UM_cExN_R3" "UM_cExN_R4"
Rename column “name” to “gene_id” for more clarity.
colnames(data)[1] <- 'gene_id'
Remove any NAs.
na.omit(data)
See how many genes we have.
dim(data)
## [1] 56609 34
Take a look at the genes in the data set and check for duplicates.
sort(table(data$gene_id), decreasing = TRUE)[1:25]
##
## 1-Mar 2-Mar 1-Dec 1-Sep 10-Mar 10-Sep 11-Mar 11-Sep
## 2 2 1 1 1 1 1 1
## 12-Sep 14-Sep 2-Sep 3-Mar 3-Sep 4-Mar 4-Sep 5_8S_rRNA
## 1 1 1 1 1 1 1 1
## 5-Mar 5-Sep 5S_rRNA 6-Mar 6-Sep 7-Mar 7-Sep 7SK
## 1 1 1 1 1 1 1 1
## 8-Mar
## 1
Looking at the results, there are no duplicates. However, looks like there are gene symbols mistakenly converted to date by Excel. I will remove these dates as it’s unclear what genes they were converted from.
data <- data[!grepl('^[0-9]{1,2}[-][a-zA-Z]{3}$', data$gene_id),]
Check the row names to see what they are. Rename each row to the gene id.
rownames(data)[1:10]
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
rownames(data) <- data$gene_id
Check if the symbols are all HGNC or if they are up to date using HGNCHelper.
replacements <- HGNChelper::checkGeneSymbols(data$gene_id)
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Warning in HGNChelper::checkGeneSymbols(data$gene_id): Human gene symbols should
## be all upper-case except for the 'orf' in open reading frames. The case of some
## letters was corrected.
## Warning in HGNChelper::checkGeneSymbols(data$gene_id): x contains non-approved
## gene symbols
# Update the gene id with the newest HGNC symbols if available
data$gene_id <- ifelse(!replacements$Approved & !is.na(replacements$Suggested.Symbol), replacements$Suggested.Symbol, replacements$x)
See how many of the remaining gene_ids could not be mapped to a HGNC symbol. Some of these include long non-coding RNAs and pseudogenes.
not_mapped <- length(which(!HGNChelper::checkGeneSymbols(data$gene_id)$Approved))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Warning in HGNChelper::checkGeneSymbols(data$gene_id): Human gene symbols should
## be all upper-case except for the 'orf' in open reading frames. The case of some
## letters was corrected.
## Warning in HGNChelper::checkGeneSymbols(data$gene_id): x contains non-approved
## gene symbols
In the experiment, there were at least 3 or more biological replicates for each subject. I will use n=3 to perform removal of features without at least 1 read per million in 3 of the samples. Use edgeR package to calculate CPM and remove low counts.
cpms = edgeR::cpm(data[,3:34])
rownames(cpms) <- data[,1]
keep = rowSums(cpms > 1) >= 3
data_filtered = data[keep,]
rows_removed = dim(data)[1]-dim(data_filtered)[1]
Check the number of genes now.
dim(data_filtered)
## [1] 18474 34
Create a grouping from the data set.
# Based on code from lecture 4
samples <- data.frame(lapply(colnames(data)[3:34], FUN=function(x){unlist(strsplit(x, split="_"))[c(1,2)]}))
colnames(samples) = colnames(data)[3:34]
rownames(samples) = c("patients", "cell_type")
samples <- data.frame(t(samples))
I will apply TMM to the data set. I chose to use this method because most of the genes were not differentially expressed and that the RNA seq data does not to be modified.
# Based on code from lecture 4
data_filtered_matrix <- as.matrix(data_filtered[,3:34])
rownames(data_filtered_matrix) <- data_filtered$gene_id
d = edgeR::DGEList(counts=data_filtered_matrix, group=samples$cell_type)
Calculate the normalization factor and get it.
# Based on code from lecture 4
d = edgeR::calcNormFactors(d)
normalized_counts <- edgeR::cpm(d)
Modify the regular data set for plotting.
data_to_plot <- reshape2::melt((edgeR::cpm(data_filtered[,3:34], log=TRUE)))
colnames(data_to_plot) <- c("gene_id", "sample", "cpm")
Modify the normalized data set for plotting.
normalized_data_to_plot <- reshape2::melt(log2(normalized_counts))
colnames(normalized_data_to_plot) <- c("gene_id", "sample", "cpm")
Create a pre-normalized boxplot to visualize the data.
ggplot2::ggplot(data_to_plot, ggplot2::aes(x=sample, y=cpm)) +
ggplot2::geom_boxplot() +
ggplot2::theme(axis.text.x=ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggplot2::labs(title="Original cIN and cExN RNASeq Samples", y="log2-CPM")
Create a normalized boxplot to visualize the data.
ggplot2::ggplot(normalized_data_to_plot, ggplot2::aes(x=sample, y=cpm)) +
ggplot2::geom_boxplot() +
ggplot2::theme(axis.text.x=ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggplot2::labs(title="Normalized cIN and cExN RNASeq Samples", y="log2-CPM")
## Warning: Removed 1965 rows containing non-finite values (stat_boxplot).
Create a pre-normalized density plot to visualize the data.
ggplot2::ggplot(data_to_plot, ggplot2::aes(x=cpm, color=sample)) +
ggplot2::geom_density() +
ggplot2::labs(title="Original cIN and cExN RNASeq Samples", y="density of log2-CPM", x="")
Create a normalized density plot to visualize the data.
ggplot2::ggplot(normalized_data_to_plot, ggplot2::aes(x=cpm, color=sample)) +
ggplot2::geom_density() +
ggplot2::labs(title="Normalized cIN and cExN RNASeq Samples", y="density of log2-CPM", x="")
## Warning: Removed 1965 rows containing non-finite values (stat_density).
The density graph of the data set looks like it follows a bimodal distribution.
Looking at the data set and the paper, the test condition is the cortical excitatory (cExN) and cortical inhibitory (cIN) neurons from the three first-degree relatives with absent, intermediate and severe expressivity of autism syndrome disorder (ASD). The control condition is the cExN and cIN neurons from the unrelated individual. The samples of the three first-degree relatives are prefixed with UM (Unaffected mother), IS (Intermediated Sister) and AP (Affected proband). The sample of the unrelated individual is prefixed with UC (Unrelated, unaffected control).
This data set is interesting to me because I am currently working in a lab that does research related to autism spectrum disorder.
There weren’t multiple expression values that mapped to a single gene.
Yes there were 19865 expression values that could not be mapped to a current HUGO symbol.
38108 outliers were removed.
Excluding the control cell line, there were 3 biological replicates from each of the two clonal lines for each test subject. For now, I decided to keep the replicates separate and analyze them independently.
There was approximately 30 million reads per sample according to the paper (Lewis, Meganathan and Baldridge 2019)